Package org.terrier.structures.indexing.singlepass.hadoop

Source Code of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is HadoopRunWriter.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*  
*/
package org.terrier.structures.indexing.singlepass.hadoop;

import java.io.IOException;

import org.apache.hadoop.mapred.OutputCollector;

import org.terrier.compression.MemorySBOS;
import org.terrier.structures.indexing.singlepass.Posting;
import org.terrier.structures.indexing.singlepass.RunWriter;

/** RunWriter for the MapReduce indexer. Provides functionality to write
* term posting lists out to the map task outputcollector during a MapReduce
* indexing job. Map and flush numbers are also passed with the posting list to
* allow for docids to be corrected later from side-effect files.
* @author Richard McCreadie and Craig Macdonald
**/
public class HadoopRunWriter extends RunWriter {
  /** output collector of Map task */
  protected OutputCollector<SplitEmittedTerm, MapEmittedPostingList> outputCollector = null;
  /** map task id that is being flushed */
  protected String mapId;
  /** flushNo is the number of times this map task is being flushed */
  protected int flushNo;
  /** The id for this split within the map task that is being flushed */
  protected int splitId;
 
  /** Create a new HadoopRunWriter, specifying the output collector of the map task
   * the run number and the flush number.
   * @param _outputCollector where to emit the posting lists to
   * @param _mapId the task id of the map currently being processed
   * @param _flushNo the number of times that this map task has flushed
   */
  public HadoopRunWriter(OutputCollector<SplitEmittedTerm, MapEmittedPostingList> _outputCollector,
      String _mapId, int _splitId, int _flushNo)
  {
    this.outputCollector = _outputCollector;
    this.mapId = _mapId;
    this.flushNo = _flushNo;
    this.splitId = _splitId;
    this.info = "HadoopRunWriter(Map "+ mapId +", flush "+flushNo+")";
  }
 
  @Override
  public void beginWrite(int maxSize, int size) throws IOException
  {}
 
  /** Write the posting to the output collector
   */
  @Override
  public void writeTerm(final String term, final Posting post) throws IOException
 
    final MemorySBOS Docs = post.getDocs();
    Docs.pad();
    //get the posting array buffer
    byte[] buffer = new byte[Docs.getMOS().getPos()+1];
    System.arraycopy(Docs.getMOS().getBuffer(), 0,
        buffer, 0,
        Math.min(Docs.getMOS().getBuffer().length, Docs.getMOS().getPos()+1));
   
    //emit the term and its posting list
    outputCollector.collect(
        SplitEmittedTerm.createNewTerm(term, splitId, flushNo),
        MapEmittedPostingList.create_Hadoop_WritableRunPostingData(
            mapId,
            flushNo,
            splitId,
            buffer,
            post.getDocF(), post.getTF()));
  }
 
  @Override
  public void finishWrite() throws IOException
  {}
 
  /** This RunWriter does not require that the output be sorted.
    */
  @Override
  public boolean writeSorted()
  {
    return false;
  }
}
TOP

Related Classes of org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.